# Prepare pre-pandemic returns
# Note: clean and combine returns from pre-pandemic elections
# By Dom Valentino and Chris Kenny
# libs --------------------------------------------------------------------
library(tidyverse) # mutate(), filter(), select(), etc.
library(readxl) # read_excel()
library(cdlTools) # fips()

# table of contents -------------------------------------------------------
# returns: clean and combine election return data
# turnout: clean and combine election turnout data
# merge: merge together data with citizen voting age population data
# codebook: codebook for final data set

# returns -----------------------------------------------------------------
returns_files <- list.files('../data/leip_returns') # read data
columns_returns <- c('state', 'county', 'fips', 'year', 'office', 'dem_share', 'treat') # column names for container
final_returns <- as.data.frame(matrix(NA, ncol = length(columns_returns))) # create container
colnames(final_returns) <- columns_returns # assign colnames
# read and clean all files with a loop
for(i in 1:length(returns_files)) {
  temp <- read.csv(paste0('../data/leip_returns/', returns_files[i]))[-1, ] # read in files
  parse <- str_split(returns_files[i], "_", n = Inf, simplify = FALSE) # split string to get metadata
  # create local container for results
  data <- as.data.frame(matrix(NA, ncol = length(columns_returns), nrow = nrow(temp)))
  colnames(data) <- columns_returns
  # fill container
  data <- data %>% mutate(state = fips(substr(str_pad(as.character(temp$FIPS), 5, "left", "0"), start = 1, stop = 2), to = "Abbreviation"),
                          county = temp$Geographic.Name,
                          fips = str_pad(temp$FIPS, 5, "left", "0"),
                          year = parse[[1]][5], # get year from filename
                          dem_share = as.numeric(temp[, 5]) / (as.numeric(temp[, 5]) + as.numeric(temp[, 6])),
                          office = case_when( # get office
                            parse[[1]][2] == "Governor" ~ "dem_share_gov",
                            parse[[1]][2] == "Presidential" ~ "dem_share_pres",
                            parse[[1]][2] == "Senate" ~ "dem_share_sen")) %>% 
    mutate(treat = case_when(
      state == "CO" & year >= 2013 ~ 1,
      state == "OR" & year >= 2000 ~ 1,
      T ~ 0
    )) %>% 
    filter(!(state %in% c("CA", "UT", "WA"))) # excluding states from thompson et al.
  # add to final dataset
  final_returns <- rbind(final_returns, data)
}

final_returns_w <- final_returns[-1, ] %>% pivot_wider(names_from = "office", values_from = "dem_share") # pivot so each row is a county-year
final_returns <- final_returns_w %>% unnest(cols = c(dem_share_gov, dem_share_pres, dem_share_sen))

# turnout -----------------------------------------------------------------
turnout_files <- list.files('../data/leip_turnout')
columns_turnout <- c('state', 'county', "fips", 'year', 'ballots_cast') # column names for container
final_turnout <- as.data.frame(matrix(NA, ncol = length(columns_turnout))) # initialize container
colnames(final_turnout) <- columns_turnout # assign colnames
# read and clean all files with a loop
for(i in 1:length(turnout_files)) {
  parse <- str_split(turnout_files[i], "_|\\.", n = Inf, simplify = FALSE) # split string to get metadata
  suppressMessages(temp <- read_excel(paste0('../data/leip_turnout/', turnout_files[i]), sheet = "County VTO")) # read in files
  colnames(temp)[1:2] <- c("county", "state")
  temp <- temp %>% mutate(year = parse[[1]][3], FIPS = str_pad(FIPS, 5, "left", "0")) %>% # get year from filename
    dplyr::select("state", "county", "fips" = "FIPS", "year", "ballots_cast" = "Ballots Cast") %>% # select variables and change their names
    filter(!(as.numeric(fips) < 100 | fips == "15099")) %>% # original file is funky, so this removes NAs and other artifacts
    filter(state %in% c("MT", "CO", "NV", "VT", "NJ", "HI")) # excluding states from thompson et al.
  # add to final dataset
  final_turnout <- rbind(final_turnout[!is.na(final_turnout$fips), ], temp)
}

# do some clean up
final_turnout <- final_turnout %>% filter(!(state %in% c("NJ", "MT") & year == 2010)) # no statewide elections in 2010 but still have turnout for lower offices - remove
final_returns <- final_returns %>% filter(!(state == "NJ" & as.numeric(year) %% 2 != 0)) # remove off-year NJ gubernatorial elections

# merge -------------------------------------------------------------------
cvap <- read_dta("../data/modified data/county_cvap.dta") # read citizen voting age population data (from thompson et al.)

# merge cvap with final_turnout, then with final_results
final <- final_turnout %>% mutate(year = as.double(year)) %>% 
  left_join(., cvap, by = c("state", "county", "year")) %>% # we are missing 9 cvap county-years corresponding to Broomfield county in CO
  mutate(year = as.character(year)) %>% 
  left_join(., final_returns, by = c("state", "county", "fips", "year")) %>% 
  mutate(turnout_share = as.numeric(ballots_cast) / as.numeric(cvap_approx))

write_csv(final, "../data/analysis/analysis_pre-pandemic.csv")
saveRDS(final_returns, "../data/returns_pre/returns_pre.Rds")

# codebook (all variables are the county-year level) ----------------------
# state: state abbreviation
# county: full county name
# fips: 5 digit county fips code
# year: election year
# ballots_cast: total number of ballots case for any office
# cvap: citizen voting age population (from thompson et al.)
# cvap_moe: margin of error on citizen voting age population (from thompson et al.)
# cvap_approx: interpolated/extrapolated citizen voting age population (from thompson et al.)
# treat: treatment indicator
# dem_share_gov: two-party Democratic vote share for governor (if applicable)
# dem_share_pres: two-party Democratic vote share for president (if applicable)
# dem_share_sen: two-party Democratic vote share for senate (if applicable)
# turnout_share: proportion of cvap who cast a ballot
